%%%%%%load reference%%%%%%
% the first is PAO1, the standard seq
[fpvA_heads,fpvA_seqs]=fastaread([datapath,'reference_FpvA.fasta']);
[fpvB_heads,fpvB_seqs]=fastaread([datapath,'reference_FpvB.fasta']);
[othersids_heads,othersids_seqs]=fastaread([datapath,'sid18_receptor.fasta']);
othersids_heads=othersids_heads';
othersids_seqs=othersids_seqs';
refreceptors.seqs=[fpvA_seqs;fpvB_seqs;othersids_seqs];
refreceptors.names=[fpvA_heads;fpvB_heads;othersids_heads];
refreceptors.num=length(refreceptors.seqs);
refreceptors.len=zeros(refreceptors.num,1);
refreceptors.len(i)=length(refreceptors.seqs{i});
refreceptors.shortnames=[];
longname=refreceptors.names{i};
if ~(i>(length(fpvA_heads)+length(fpvB_heads)))
strlist=strsplit(longname,'|');
if i>length(fpvA_heads)%fpvB
shortname=['fpvB |',strlist{end}];
refreceptors.typeid(i,1)=2;
shortname=['fpvA |',strlist{end}];
refreceptors.typeid(i,1)=1;
strlist=strsplit(longname,' ');
refreceptors.typeid(i,1)=0;
shortname(shortname=='_')='-';
refreceptors.shortnames{i,1}=shortname;
refreceptors.seqlenrange=[750,850];
refreceptors.tocheckseq=find(~strcmp('pseudobactin',refreceptors.shortnames) & ~(refreceptors.len<refreceptors.seqlenrange(1)) & ~(refreceptors.len>refreceptors.seqlenrange(2)));
receptor_hmm_scorem=size(3,size(refreceptors.mulalign_tocheck,1));
receptor_hmm_se=size(3,length(refreceptors.tocheckseq),2);
mulalign_countisdomain=zeros(3,size(refreceptors.mulalign_tocheck,2));
for i=1:length(refreceptors.tocheckseq)
seq=refreceptors.seqs{refreceptors.tocheckseq(i)};
mul_thisseq_nogap=(refreceptors.mulalign_tocheck(i,:)~='-');
nongapcum=cumsum(mul_thisseq_nogap);
for m=1:recdomain_hmms.num
[phaligncore, alignment, pointer] = hmmprofalign(recdomain_hmms.models{m}, seq);
receptor_hmm_scorem(m,i)=phaligncore;
receptor_hmm_seqs{m,i}=alignment;
receptor_hmm_se(m,i,1)=min(pointer);
receptor_hmm_se(m,i,2)=max(pointer);
startponmul=find(nongapcum==min(pointer), 1 );
stopponmul=find(nongapcum==max(pointer), 1 );
mulalign_countisdomain(m,startponmul:stopponmul)=mulalign_countisdomain(m,startponmul:stopponmul)+1;
for i=1:length(refreceptors.shortnames)
if refreceptors.typeid(i)==1
elseif refreceptors.typeid(i)==2
shortername{i,1}=refreceptors.shortnames{i};
thresholds_list=[25,50,80];
for m=1:recdomain_hmms.num
subplot(recdomain_hmms.num,1,m);
bar(receptor_hmm_scorem(m,refreceptors.optleaforder));
plot([0.5,length(refreceptors.optleaforder)+1.5],thresholds_list(m)*[1,1],'r --')
xlim([0.5,length(refreceptors.optleaforder)+0.5])
xticks(1:length(refreceptors.tocheckseq))
xticklabels(shortername(refreceptors.tocheckseq(refreceptors.optleaforder)));
% conservation and fragmentation of the refseq
threshhold_conservescore=0.9;
conservescore=(max(ref_Score)-ref_Score).*(1-refreceptors.gapfreq);
% does the seq exist in the first sequence(PAO fpv1);
existinPao1=find(refreceptors.mulalign_tocheck(1,:)~='-');
templist=find(conservescore>threshhold_conservescore*max(conservescore));
seperateposi=templist(1);
for i=2:(length(templist)-1)
if templist(i)-templist(i-1)>0
seperateposi(end+1)=templist(i);
seperateposi=unique([1,seperateposi,length(conservescore)]);
% locate the position in PAO1
mullocinpao1=cumsum(refreceptors.mulalign_tocheck(1,:)~='-');
sep_se=[seperateposi(1:(end-1))',seperateposi(2:end)'-1];
sep_se(end,2)=length(conservescore);
sepscore_list=zeros(size(sep_se,1),1);
mulisAlist=find(refreceptors.typeid(refreceptors.tocheckseq)==1);
otherlist=find(refreceptors.typeid(refreceptors.tocheckseq)~=1);
scorem=squareform(seqpdist(refreceptors.mulalign_tocheck(:,sp:ep),'Method','p-distance'));
dist_A_A=(scorem(mulisAlist,mulisAlist));
dist_A_other=scorem(mulisAlist,otherlist);
sepscore_A=(mean(dist_A_other(:))-mean(dist_A_A(:)))/std(dist_A_A(:));
sepscore_list(i)=sepscore_A;
subplot(subplotnum,1,1:(subplotnum-1))
xlist=1:length(existinPao1);
plot(xlist,conservescore(existinPao1),'k');
scatter(mullocinpao1(seperateposi),conservescore(seperateposi),30,'k','filled');
colortable=colormap(jet);
mincolorv=0.6;maxcolorv=1.2;
splist=intersect(sp:ep,existinPao1);
splist(end)=splist(end)+1;
sepscore_A=sepscore_list(i);
colorid=min(size(colortable,1),max(1,round(size(colortable,1)*(sepscore_A-mincolorv)/(maxcolorv-mincolorv))));
fh=fill(mullocinpao1([splist(1),splist,splist(end)]),[0,conservescore(splist),0],colortable(colorid,:));
% mark the poisition for domains
for i=1:size(refreceptors.pao1_ses)
startp=refreceptors.pao1_ses(i,1);
stopp=refreceptors.pao1_ses(i,2);
colorcode=refreceptors.color(i,:);
annostr=refreceptors.annostr{i};
patch([startp,stopp,stopp,startp],-0.1+[-1,-1,0,0]*height,colorcode);
text(startp,-0.5,annostr)
ch.TickLabels= strsplit(sprintf('%0.1f ',[mincolorv,maxcolorv]));
xlim([0,length(existinPao1)+1])
ylim([-1.2,max(conservescore)+0.2])
ylabel('conservation score');
subplot(subplotnum,1,subplotnum)
% mark the three domain annotation
barheight=sum(mulalign_countisdomain,1);
imagesc(1:length(barheight),-1.6*ones(size(barheight)),barheight)
xlim([0,length(existinPao1)+1])
select_topregnum=2;gapthresh=0.5;
%raw=load('./output/firstversion_fpvApfammodel.mat');% saved for last time
[~,order]=sort(sepscore_list,'descend');
sptables=sep_se(order(1:select_topregnum),1:2);
mulalign=refreceptors.mulalign_tocheck(:,sp:ep);
gapfreq=sum(mulalign=='-',1)/size(mulalign,1);
titlestr=sprintf('%d(%s)-%d(%s)',mullocinpao1(sp),refreceptors.mulalign_tocheck(1,sp),mullocinpao1(ep),refreceptors.mulalign_tocheck(1,ep));
scorem=squareform(seqpdist(mulalign,'Method','p-distance'));
recordscorem(:,:,i)=scorem;
imagesc(scorem(refreceptors.optleaforder,refreceptors.optleaforder));
yticks(1:length(refreceptors.tocheckseq))
yticklabels(shortername(refreceptors.tocheckseq(refreceptors.optleaforder)));
plot([0,size(scorem,1)+1],(sum(refreceptors.typeid(refreceptors.tocheckseq(refreceptors.optleaforder))==2)+0.5)*[1,1],'w')
plot([0,size(scorem,1)+1],(sum(refreceptors.typeid(refreceptors.tocheckseq(refreceptors.optleaforder))>0)+0.5)*[1,1],'w');
plot((sum(refreceptors.typeid(refreceptors.tocheckseq(refreceptors.optleaforder))==2)+0.5)*[1,1],[0,size(scorem,1)+1],'w')
plot((sum(refreceptors.typeid(refreceptors.tocheckseq(refreceptors.optleaforder))>0)+0.5)*[1,1],[0,size(scorem,1)+1],'w');
%%%%%%%%%%%%%%% get a model
touselist=find(refreceptors.typeid(refreceptors.tocheckseq)==1);
touselist=find((refreceptors.typeid==2));
firstmodel = hmmprofstruct(sum(gapfreq<gapthresh));
fpvA_pfammodels.num=fpvA_pfammodels.num+1;
fpvA_pfammodels.models{fpvA_pfammodels.num}= hmmprofestimate(firstmodel, mulalign(touselist,:));
fpvA_pfammodels.fpv_ABother=refreceptors.typeid(refreceptors.tocheckseq);
fpvA_pfammodels.mulalign{fpvA_pfammodels.num}=mulalign;
sepscore_A=sepscore_list(order(i));
colorid=min(size(colortable,1),max(1,round(size(colortable,1)*(sepscore_A-mincolorv)/(maxcolorv-mincolorv))));
fpvA_hmm_scorem=size(fpvA_pfammodels.num,size(refreceptors.mulalign_tocheck,1));
fpvA_hmm_se=size(3,size(refreceptors.mulalign_tocheck,1),2);
for i=1:size(refreceptors.mulalign_tocheck,1)
seq=refreceptors.seqs{refreceptors.tocheckseq(i)};
for m=1:fpvA_pfammodels.num
[phaligncore, alignment, pointer] = hmmprofalign(fpvA_pfammodels.models{m}, seq);
fpvA_hmm_scorem(m,i)=phaligncore;
fpvA_hmm_seqs{m,i}=alignment;
fpvA_hmm_se(m,i,1)=min(pointer);
fpvA_hmm_se(m,i,2)=max(pointer);
list=find(refreceptors.typeid(refreceptors.tocheckseq)==(i-1));
lh(i)=scatter(fpvA_hmm_scorem(1,list),fpvA_hmm_scorem(2,list),30,colorcodes(i,:),'filled');
plot(pickthresh(1)*[1,1],[1,max(fpvA_hmm_scorem(2,:))],'-- k')
plot([pickthresh(1),max(fpvA_hmm_scorem(1,:))],pickthresh(2)*[1,1],'-- k')
ylim([0,max(fpvA_hmm_scorem(2,:))+1])
xlim([0,max(fpvA_hmm_scorem(1,:))+1])
legend(lh,{'other receptors','fpvA','fpvB'},'Location','northwest')
% define distance to nrps pep region
pickedrec_disttonrps=(pickedrec.recs_dist_threetypes(309:end,3));
nanarea=find(isnan(pickedrec.recs_dist_threetypes(309:end,3)) &~isnan(pickedrec.recs_dist_threetypes(309:end,2)));pickedrec_disttonrps(nanarea)=pickedrec.recs_dist_threetypes(nanarea,2);% optional
% three classes of receptors
rec_candfpva=find(pickedrec.fpvA_hmm_scorem(309:end,1)>pickthresh(1) & ~(pickedrec.fpvA_hmm_scorem(309:end,2)>pickthresh(2)));
rec_candfpvb=find(pickedrec.fpvA_hmm_scorem(309:end,1)>pickthresh(1) & (pickedrec.fpvA_hmm_scorem(309:end,2)>pickthresh(2)));
rec_candother=setdiff((1:(pickedrec.num-308))',[rec_candfpva;rec_candfpvb]);
fprintf('%d\t fpvA candidates\n',length(rec_candfpva))
fprintf('%d\t fpvB candidates\n',length(rec_candfpvb))
fprintf('%d\t other candidates\n',length(rec_candother))
[node_groupid,totalgnum,changedleaforder_bygroups] = Cluster_byconnection(pickedrec_fpva.seqdist,test_distthresh,optleaforderfpva);
rec_candother=setdiff((1:(pickedrec.num-308))',[rec_candfpva;rec_candfpvb]);
a=ismember(changedleaforder_bygroups,labnum);
changedleaforder_bygroups_1=changedleaforder_bygroups(b);
imagesc(refdistm_fpva(refreceptors.optleaforder,changedleaforder_bygroups_1));
thisg_members=find(node_groupid(changedleaforder_bygroups_1)==g);
groupsize(g)=length(thisg_members);
plot((thisg_members(end)+1)*[1,1],[0,length(changedleaforder_bygroups_1)],'k','LineWidth',lw)
plot([0,length(changedleaforder_bygroups_1)+1],(sum(refreceptors.typeid(refreceptors.tocheckseq(refreceptors.optleaforder))==2)+0.5)*[1,1],'w')
plot([0,length(changedleaforder_bygroups_1)+1],(sum(refreceptors.typeid(refreceptors.tocheckseq(refreceptors.optleaforder))>0)+0.5)*[1,1],'w');
xlim([0,length(changedleaforder_bygroups_1)+1])
title(sprintf('threshold %0.2f',test_distthresh))
%remove lab strains from the data, that's for the "sequence-to-ecology"
if length(pickedrec_fpva.candself)>4547
pickedrec_fpva_1=pickedrec_fpva;
pickedrec_fpva.speciesid=pickedrec_fpva_1.speciesid(120:end);
pickedrec_fpva.candself=pickedrec_fpva_1.candself(120:end);
pickedrec_fpva.fpvA_hmm_scorem=pickedrec_fpva_1.fpvA_hmm_scorem(120:end,:);
pickedrec_fpva.genekind=pickedrec_fpva_1.genekind(120:end);
pickedrec_fpva.locustag=pickedrec_fpva_1.locustag(120:end);
pickedrec_fpva.matchtoknown=pickedrec_fpva_1.matchtoknown(120:end);
pickedrec_fpva.mindist=pickedrec_fpva_1.mindist(120:end);
pickedrec_fpva.num=length(pickedrec_fpva.speciesid);
pickedrec_fpva.orderinpick=pickedrec_fpva_1.orderinpick(120:end);
pickedrec_fpva.rec_hmm_scorem=pickedrec_fpva_1.rec_hmm_scorem(120:end,:);
pickedrec_fpva.recs_dist_threetypes=pickedrec_fpva_1.recs_dist_threetypes(120:end,:);
pickedrec_fpva.recs_isminto_threetypes=pickedrec_fpva_1.recs_isminto_threetypes(120:end,:);
pickedrec_fpva.seqlen=pickedrec_fpva_1.seqlen(120:end);
pickedrec_fpva.seqs=pickedrec_fpva_1.seqs(120:end);
pickedrec_fpva.uninames=pickedrec_fpva_1.uninames(120:end);
%Find strains with the shortest genomic distance between receptor and synthesizetase in 1664 strains of bacteria
spname=[taxt.producer;taxt.partial_producer];
if size(mindist_toref,2)>4547
mindist_toref_1=mindist_toref;
mindist_toref=mindist_toref_1(120:end);
a=find(ismember(pickedrec_fpva.uninames,name1));
temp=pickedrec_fpva.recs_dist_threetypes(a,2:3);
mindist(i).toPep=min( temp(:,1));
mindist(i).toPeprecnum=find(temp(:,1)==min( temp(:,1)));
mindist(i).Pepmindisttoref=temp_1(mindist(i).toPeprecnum);
mindist(i).toTE=min( temp(:,2));
mindist(i).toTErecnum=find(temp(:,2)==min( temp(:,2)));
mindist(i).TEmindisttoref=temp_1(mindist(i).toTErecnum);
mindist_toPep=cell2table({mindist.toPep}');
mindist_toTE=cell2table({mindist.toTE}');
mindist_Pepmindisttoref=cell2table({mindist.Pepmindisttoref}');
mindist_TEmindisttoref=cell2table({mindist.TEmindisttoref}');
a=find(isnan(mindist_toPep.Var1));
b=find(~isnan(mindist_toPep.Var1));
c=find(isnan(mindist_toTE.Var1));
d=find(~isnan(mindist_toTE.Var1));
data1=mindist_toTE.Var1(a1);
data1_disttoref=cell2table(mindist_TEmindisttoref.Var1(a1));
data2=mindist_toPep.Var1(b1);
data2_disttoref=cell2table(mindist_Pepmindisttoref.Var1(b1));
data3_1=mindist_toTE.Var1(c1);
data3_2=mindist_toPep.Var1(c1);
data3_disttoref=cell2table(mindist_TEmindisttoref.Var1(c1));
data1_1=[data1;data_toTe];
data1_1_disttoref=[data1_disttoref.Var1;data3_disttoref.Var1(c)];
data3_1_disttoref=data3_disttoref.Var1(c1);
if data3_1_1(i)>data3_2_1(i)
data1_1(data1_1>20000)=25000;
data2(data2>20000)=25000;
data3(data3>20000)=25000;
FpvA=[data1_1;data2;data3];
data_disttoref=[data1_1_disttoref;data2_disttoref.Var1;data3_1_disttoref];
hasmindisttoref_1=(data_disttoref<mintoref_thresh);
hasmindisttoref_2=hasmindisttoref_1;
cand=find(hasmindisttoref_2==1);titlestr='has ref';
datatoshow=FpvA(cand);boarders=3*[0,1]*10^4;
[bheight1,barloc1]=hist(datatoshow,linspace(boarders(1),boarders(2),barnum));
cand=find(hasmindisttoref_2==0);titlestr='no ref';
datatoshow=FpvA(cand);boarders=3*[0,1]*10^4;
[bheight2,barloc2]=hist(datatoshow,linspace(boarders(1),boarders(2),barnum));
data1_gsh=bheight1+bheight2;
data2_gsh=log10(data1_gsh+1);
data_gsh=[log10(bheight1+1);log10(bheight2+1)];
data_gsh=[((log10(bheight1+1) ./ (log10(bheight1+1)+log10(bheight2+1))).* data2_gsh);((log10(bheight2+1) ./ (log10(bheight1+1)+log10(bheight2+1))).* data2_gsh)]
1.4842 1.1228 1.5108 1.1025 0.6792 0.7263 0.6373 1.0414 0.8736 0.5148 0.8372 0.4771 NaN 0.6990 NaN 0.2386 0.3010 NaN 0.3010 NaN 0 0 0 0.6021 NaN NaN 0.3010 NaN 0.4771 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.6344 NaN NaN NaN NaN NaN NaN NaN NaN
1.2693 1.2938 1.1687 0.8798 0.7010 0.4199 0.6637 0 0.4274 0.6313 0.2420 0 NaN 0 NaN 0.2386 0 NaN 0 NaN 0.4771 0.6990 0.3010 0 NaN NaN 0 NaN 0 0.7782 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.7806 NaN NaN NaN NaN NaN NaN NaN NaN
colors = [247/256 242/256 62/256; 0 255/256 255/256];
h=bar(barloc1,data_gsh',0.8,'stacked');
xlabel('Genimic distance between receotor and synthetase (bp)');
legend('Similar to references', 'Dissimilar to references', 'Location', 'northeast');
reces_has_distototype3=find(~isnan(pickedrec_fpva.recs_dist_threetypes(:,3)) );
reces_has_distototype2=find(~isnan(pickedrec_fpva.recs_dist_threetypes(:,2)) );
reces_has_distototype2_3=union(reces_has_distototype2,reces_has_distototype3);
recs_candself_3=find(abs(pickedrec_fpva.recs_dist_threetypes(:,3))<thresgenomedist);
recs_candself_3=intersect(recs_candself_3,reces_has_distototype3);
recs_candself_2=find(abs(pickedrec_fpva.recs_dist_threetypes(:,2))<thresgenomedist);
recs_candself_2=intersect(recs_candself_2,reces_has_distototype2);
recs_candself=union(recs_candself_2,recs_candself_3);
for i=1:length(recs_candself)
pickedrec_fpva.proxirec(rid,1)=1;
if length(pickedrec_fpva.proxirec)<4547
pickedrec_fpva.proxirec(end+1)=0;
[~,fastaseq]=fastaread('./data/fpva_4666_aligned.fasta');
for i=120:length(fastaseq)
fpvas_aligned_6047(num,:)=fastaseq{i};
if size(refdistm_fpva,2)>4547
refdistm_fpva_1=refdistm_fpva;
refdistm_fpva=refdistm_fpva_1(:,120:end);
surerec=find(pickedrec_fpva.proxirec==1);
pao1rec=find(refdistm_fpva(1,:)<0.0013,1);
surerec=[pao1rec;surerec];
surerecseq_align=fpvas_aligned_6047(surerec,:);
pao1rec_locinsurerecseq_align=1;
load ./data/seqdist4666.mat;
if size(seqdist4666,1)>4547
seqdist4666_1=seqdist4666;
seqdist4666=seqdist4666_1(120:end,120:end);
seqdist1231=seqdist4666(surerec,surerec);
fpvaseqdist=linkage(seqdist1231,'single');
optleaforderfpva = optimalleaforder(fpvaseqdist,seqdist1231);
[node_groupid,totalgnum,changedleaforder_bygroups] = Cluster_byconnection(seqdist1231,test_distthresh,optleaforderfpva);
suregroup=unique(node_groupid);
pao1loc=pao1rec_locinsurerecseq_align;
sele_mualign=surerecseq_align;
[sele_CSeq, sele_Score] = seqconsensus(sele_mualign);
gapfreq=sum(sele_mualign=='-')/size(sele_mualign,1);
conservescore=(max(sele_Score)-sele_Score).*(1-gapfreq);
existinPao1=find(sele_mualign(pao1rec_locinsurerecseq_align,:)~='-' & sele_mualign(pao1rec_locinsurerecseq_align,:)~='*');
mullocinpao1=cumsum(sele_mualign(pao1rec_locinsurerecseq_align,:)~='-'& sele_mualign(pao1rec_locinsurerecseq_align,:)~='*');
freq=tabulate(node_groupid);
% the reduced entropy along the group
mul_int=aa2int(sele_mualign);
mulinfor_group=zeros(1,size(sele_mualign,2));
for i=1:size(sele_mualign,2)
[H1,H2,CH1_given2,CH2_given1,mulin]=ZL_Mulinfor_int(mul_int(:,i),node_groupid);
mulinfor_group(i)=mulin/H2;
threshhold_conservescore=0.95;
templist=find(conservescore>threshhold_conservescore*max(conservescore));
% where are the three domains
seperateposi=templist(1);
for i=2:(length(templist)-1)
if templist(i)-templist(i-1)>15
seperateposi(end+1)=templist(i);
seperateposi=unique([1,seperateposi,length(conservescore)]);
sep_se=[seperateposi(1:(end-1))',seperateposi(2:end)'-1];
sep_se(end,2)=length(sele_Score);
clustering_score=zeros(1,size(sep_se,1));
unigs=unique(node_groupid);
scorem=squareform(seqpdist(sele_mualign(:,sp:ep),'Method','p-distance'));
dist_between=zeros(length(unigs));
std_within=zeros(1,length(unigs));
members_1=find(node_groupid==unigs(g1));
arrange=[arrange,members_1];
members_2=find(node_groupid==unigs(g2));
tempdistm=scorem(members_1,members_2);
dataline=squareform(tempdistm);
std_within(g1)=mean(dataline);
dist_between(g1,g2)=mean(dataline);
dist_between(g2,g1)=dist_between(g1,g2);
weighted_between=dist_between;
for k=1:size(weighted_between,1)
weighted_between(k,k)=nan;
clustering_twoscore(i,:)=[mean(weighted_between(:),'omitnan'),mean(std_within,'omitnan')];
clustering_score(i)=mean(weighted_between(:),'omitnan')/mean(std_within,'omitnan');
colortable=colormap(jet);
subplot(subplotnum,1,1:(subplotnum-1))
xlist=1:length(existinPao1);
plot(xlist,conservescore(existinPao1),'k');
scatter(mullocinpao1(seperateposi),conservescore(seperateposi),30,'k','filled')
mincolorv=min(clustering_score)%3;
maxcolorv=max(clustering_score)%15;
splist=intersect(sp:ep,existinPao1);
% how many can be traced into the first 21
cscore=clustering_score(i);
colorid=min(size(colortable,1),max(1,round(size(colortable,1)*(cscore-mincolorv)/(maxcolorv-mincolorv))));
fh=fill(mullocinpao1([splist(1),splist,splist(end)]),[0,conservescore(splist),0],colortable(colorid,:));
% mark the poisition for domains
refreceptors.pao1_ses=[1,43;
refreceptors.annostr={'Signaling Sequence';
refreceptors.color=[0.5,0.5,0.5;
for i=1:size(refreceptors.pao1_ses)
startp=refreceptors.pao1_ses(i,1);
stopp=refreceptors.pao1_ses(i,2);
colorcode=refreceptors.color(i,:);
annostr=refreceptors.annostr{i};
patch([startp,stopp,stopp,startp],-0.1+[-1,-1,0,0]*height,colorcode);
text(startp,-0.5,annostr)
ch.TickLabels= strsplit(sprintf('%0.1f ',[mincolorv,maxcolorv]));
xlim([0,length(existinPao1)+1])
ylim([-1.2,max(conservescore)+0.2])
ylabel('conservation score');
subplot(subplotnum,1,subplotnum)
xlist=1:length(existinPao1);
plot(xlist,mulinfor_group(existinPao1))
xlim([0,length(existinPao1)+1])
distm_rec=rec.seqdist678910;
imagesc(distm_rec(leaforder_alonesp_new,leaforder_alonesp_new));
rec.straintype(1:986,1)=1;
cheater1name=find(strcmp(rec.strainnames,taxt.cheater(1)));
rec.straintype(987:(cheater1name(1)-1),1)=2;
rec.straintype(cheater1name(1):rec.datanum,1)=3;
groupsize=zeros(1,totalgnum_new);
group_typebelong=zeros(3,totalgnum_new);
group_selfnum=zeros(1,totalgnum_new);
members_rec=find(recgroup_new==g);
members_straintype=rec.straintype(members_rec);
group_typebelong(:,g)=[sum(members_straintype==1),sum(members_straintype==2),sum(members_straintype==3)];
group_selfnum(g)=sum(rec.candyself(members_rec));
thisg_members=find(recgroup_new(leaforder_alonesp_new)==g);
groupsize(g)=length(thisg_members);
plot([0,length(leaforder_alonesp_new)],(thisg_members(end)+1)*[1,1],'k','LineWidth',lw);
plot((thisg_members(end)+1)*[1,1],[0,length(leaforder_alonesp_new)],'k','LineWidth',lw);
groupassigncolor=find(group_typebelong(1,:)>0);
[~,order]=sort(group_typebelong(1,groupassigncolor));
members_rec=find(recgroup_new==g);
locinassign=find(groupassigncolor==g);
members_rec=find(recgroup_new==g);
members_straintype=rec.straintype(members_rec);
members_rec_pure=members_rec(members_straintype==1);
locsinorder=sort(find(ismember(leaforder_alonesp_new,members_rec_pure)));
for j=1:length(locsinorder)
if j==length(locsinorder)||(j>1 && ((locsinorder(j)-locsinorder(j-1))>1))'
setables(end+1,:)=[s-0.5,e+0.5];
xlim([0,length(leaforder_alonesp_new)+1]);
phygroup_syz=readtable('./data/phygroup_0308_syndist.xlsx');
警告: 在为表创建变量名称之前,对文件中的列标题进行了修改,以使其成为有效的 MATLAB 标识符。原始列标题保存在 VariableDescriptions 属性中。
将 'VariableNamingRule' 设置为 'preserve' 以使用原始列标题作为表变量名称。
phygroup=phygroup_syz.phygroup0_3_18_;
phygroup_above2=find(aaa(:,2)>1);
digital=readtable('./data/pyoverdine_digital_1664.xlsx');
for i=1:length(phygroup_syz.Species)
a=ismember(rec.strainnames,phygroup_syz.Species{i});
recgroup_temp=recgroup_new(a);
sp1928(i).spanme=phygroup_syz.Species{i};
sp1928(i).phyg=phygroup(i);
sp1928(i).recg=sort(recgroup_temp);
sp1928(i).len=length(recgroup_temp);
tem_char = [tem_char,'-',num2str(recnum(i,j))];
sp1928(i).reccomb=tem_char;
H_score=100*ones(length(aaa),3);
H_score(:,1)=1:length(aaa);
for i=1:length(phygroup_above2)
temp_reccomb_phyg1={sp1928.reccomb};
temp_reccomb_phyg2=temp_reccomb_phyg1(a);
b_phyg2=tabulate(temp_reccomb_phyg2);
p_phy=cell2table(b_phyg2(:,3));
H_phy1=-log2(p_phy.Var1/100);
H_phy2=(p_phy.Var1/100) .* H_phy1;
temp_rec_phyg1=[sp1928(a).recg];
b_recg=tabulate(temp_rec_phyg1);
b_temp=find(b_recg(:,2)>0);
b_recg2=b_recg(b_temp,:);
p_recg2=b_recg2(:,3)/100;
H_recg2= p_recg2 .* H_recg1;
H_score(phyg,2)=H_reccomb;
sp1928(i).Hreccomb=H_score(g1,2);
sp1928(i).Hrecg=H_score(g1,3);
data=readtable('./data/Rec_div_0309.xlsx');
domainsp=["aeruginosa","fluorescens","syringae","putida","sp"];
a=find(data.phygroup==i);
c=find(ismember(b(:,1),domainsp));
d=find(~ismember(b(:,1),domainsp));
usedata(num).spname=b(c(j),1);
temp1=cell2table(b(c(j),2));
usedata(num).num=temp1.Var1;
usedata(num).spname='Other';
usedata(num).num=sum(f.Var1);
color6=[202/255,39/255,30/255;110/255,179/255,191/255;216/255,117/255,36/255;61/255,115/255,60/255;16/255,66/255,135/255;125/255,125/255,128/255];
color_matrix=zeros(length(usedata),3);
if strcmp(usedata(i).spname,"aeruginosa")
color_matrix(i,:)=color6(1,:);
elseif strcmp(usedata(i).spname,"fluorescens")
color_matrix(i,:)=color6(2,:);
elseif strcmp(usedata(i).spname,"syringae")
color_matrix(i,:)=color6(3,:);
elseif strcmp(usedata(i).spname,"putida")
color_matrix(i,:)=color6(4,:);
elseif strcmp(usedata(i).spname,"sp")
color_matrix(i,:)=color6(5,:);
elseif strcmp(usedata(i).spname,"Other")
color_matrix(i,:)=color6(6,:);
value_phyg=readtable('./data/value_phygroup_0309.xlsx');
value_1=table2array(value_phyg);
c=find(~isnan(X((i-1),:)));
b=bar(i:i+1,[X(i,1:length(a));zeros(1,length(a))],0.75,'stacked');
set(b(j),'facecolor',color_matrix(num+j,:))
b=bar(i:i+1,[X(i,1:length(a));zeros(1,length(a))],0.75,'stacked');
set(b(j),'facecolor',color_matrix(j,:))
usephygroup=value_1(:,1);
for i=1:size(usephygroup,1)
a=find(data.phygroup==g1);
H_recg_temp=data.H_recg(a);
H_recg_use(i,1)=H_recg_temp(1);
for s=1:length(H_recg_use)
text(xloc,yloc,sprintf('%.1f',data_recdiv(r,s)),'HorizontalAlignment', 'center')
leaforder_alonesp=changedleaforder_bygroups;
groupfreq=tabulate(recgroup);
rec_groupsize=groupfreq(:,2);
[~,recgroup_orderbysize]=sort(rec_groupsize,'descend');
toplotm=seqdist1231(leaforder_alonesp,leaforder_alonesp);
scorem=squareform(seqpdist(sele_mualign(:,sep_se(5,1):sep_se(8,2)),'Method','p-distance'));
toplotm=scorem(leaforder_alonesp,leaforder_alonesp);
titlestr='d(rec_feature)';
average_m=zeros(totalgnum);
thisg_members=find(recgroup(leaforder_alonesp)==g);
rg_member=find(recgroup(leaforder_alonesp)==r);
tempm=toplotm((thisg_members),(rg_member));
average_m(g,r)=mean(tempm(:));
average_m(r,g)=average_m(g,r);
if length(thisg_members)>groupsize_thresh
text(-80, mean(thisg_members),num2str(g));
xlim([-50,length(leaforder_alonesp)+1])
xx=distm_rec(leaforder_alonesp,leaforder_alonesp);
tokeep=(~isnan(x) & ~isnan(y));
corm=corrcoef(x(tokeep),y(tokeep));
% distribution in bacteria
raw=importdata('./data/FpvA_in_bacteria.xlsx');
fpvAdata=raw.textdata.origin(2:end,:);
fpvA_columnid.ispathogen=3;
fpvA_columnid.species=11;
%distinguishable_colors(10,'w');
phylo_id=zeros(size(fpvAdata,1),length(6:11));
% assign each level an ID, that in descending order
[uni_strs,~,ids]=unique(fpvAdata(:,i+5));
% measure the counts of each class
str_count(j)=sum(ids==j);
[value,order]=sort(str_count);
levelinfor(i).uni_strs=uni_strs(order);
levelinfor(i).str_count=str_count(order);
previoustonow(order)=1:length(order);
phylo_id(:,i)=previoustonow(ids)';
% then, order all strains by their column ids
[uni_rows,~,strain_id_inunirow]=unique(phylo_id,'rows');
% ph=patch(1+6*[0 1 1 0],[0,0,1,1],[1,1,1]*0.8);ph.LineStyle='none';
species_couont=species_couont+sum(strain_id_inunirow==j);
if j==size(uni_rows,1)||(uni_rows(j+1,i)~= uni_rows(lastid,i))
boxpercent=species_couont/size(strain_id_inunirow,1);
if boxpercent>toplotthresh % plot out only under conditions
rows_sameprevious=find(uni_rows(:,i-1)==uni_rows(j,i-1));
% how many different current for these
[unicur,~,currentids]=unique(uni_rows(rows_sameprevious,i));
thislevelcount=max(currentids);
colorid=find(uni_rows(j,i)==unicur);
colortable=distinguishable_colors(thislevelcount);
ph=patch(i+[0,1,1,0],previouspercent+boxpercent*[0,0,1,1],colortable(colorid,:));
if boxpercent>textthreshold
strtoshow=levelinfor(i).uni_strs{uni_rows(lastid,i)};
strlist=strsplit(strtoshow);
strtoshow=[strlist{1}(1),'.',strlist{2}];
text(i,previouspercent+0.5*boxpercent,strtoshow);
previouspercent=previouspercent+boxpercent;
% plot the last one, if it takes certain space
previouspercent+boxpercent;
set(gca,'ytick',[],'xtick',[]);